# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load
import numpy as np # for performing linear algebraic operations
import pandas as pd # for performing data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame # importing dataframe from pandas to work with dataframe
from scipy.spatial import distance # to find the euclidean distances between arrays
import warnings # to report the warning scenarios
from sklearn.metrics import classification_report, confusion_matrix # to measure the quality of the model predictions
import matplotlib.pyplot as plt # to work with plots
import seaborn as sns # to work with statistical graphs
import plotly.express as px # to work with matrix/ column oriented data
from sklearn.feature_selection import SelectKBest # Imported for feature selection technique
from sklearn.feature_selection import chi2 # Imported for feature selection technique
from sklearn.preprocessing import MinMaxScaler # to convert the features to given range
from collections import Counter # to count the total number of values as dictionary
from imblearn.over_sampling import SMOTE,ADASYN # used to distribute the data evenly over sampling technique
from sklearn.preprocessing import RobustScaler,StandardScaler,LabelEncoder,LabelBinarizer # imported for transforming the value
from sklearn.preprocessing import StandardScaler # To perform standard scaling
# Reading and checking the shape of the data
data = pd.read_csv('fifa21.csv')
data.shape
(18541, 92)
data.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Penalties | Composure | Defensive Awareness | Standing Tackle | Sliding Tackle | GK Diving | GK Handling | GK Kicking | GK Positioning | GK Reflexes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 253283 | Facundo Pellistri | 18 | https://cdn.sofifa.com/players/253/283/20_60.png | Uruguay | https://cdn.sofifa.com/flags/uy.png | 71 | 87 | Peñarol | ... | 66.0 | 61.0 | 35.0 | 11.0 | 18.0 | 9.0 | 12.0 | 7.0 | 8.0 | 7.0 |
| 1 | 1 | 179813 | Edinson Cavani | 32 | https://cdn.sofifa.com/players/179/813/20_60.png | Uruguay | https://cdn.sofifa.com/flags/uy.png | 86 | 86 | Paris Saint-Germain | ... | 85.0 | 80.0 | 57.0 | 48.0 | 39.0 | 12.0 | 5.0 | 13.0 | 13.0 | 10.0 |
| 2 | 2 | 245541 | Giovanni Reyna | 17 | https://cdn.sofifa.com/players/245/541/20_60.png | United States | https://cdn.sofifa.com/flags/us.png | 68 | 87 | Borussia Dortmund | ... | 50.0 | 59.0 | 30.0 | 23.0 | 24.0 | 10.0 | 13.0 | 14.0 | 12.0 | 7.0 |
| 3 | 3 | 233419 | Raphael Dias Belloli | 23 | https://cdn.sofifa.com/players/233/419/20_60.png | Brazil | https://cdn.sofifa.com/flags/br.png | 81 | 85 | Stade Rennais FC | ... | 73.0 | 79.0 | 45.0 | 54.0 | 38.0 | 8.0 | 7.0 | 13.0 | 8.0 | 14.0 |
| 4 | 4 | 198710 | James Rodríguez | 28 | https://cdn.sofifa.com/players/198/710/20_60.png | Colombia | https://cdn.sofifa.com/flags/co.png | 82 | 82 | Everton | ... | 81.0 | 87.0 | 52.0 | 41.0 | 44.0 | 15.0 | 15.0 | 15.0 | 5.0 | 14.0 |
5 rows × 92 columns
data.columns
Index(['Unnamed: 0', 'ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag',
'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
'Preferred Foot', 'Weak Foot', 'Skill Moves',
'International Reputation', 'Work Rate', 'Body Type', 'Real Face',
'Release Clause', 'Position', 'Jersey Number', 'Joined',
'Contract Valid Until', 'Height', 'Weight', 'LS', 'ST', 'RS', 'LW',
'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',
'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
'GK', 'Likes', 'Dislikes', 'Following', 'Crossing', 'Finishing',
'Heading Accuracy', 'Short Passing', 'Volleys', 'Dribbling', 'Curve',
'FK Accuracy', 'Long Passing', 'Ball Control', 'Acceleration',
'Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Shot Power',
'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Aggression',
'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
'Defensive Awareness', 'Standing Tackle', 'Sliding Tackle', 'GK Diving',
'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes'],
dtype='object')
Let's first consider the features which may influence the model.
features = ['Name', 'Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value', 'Wage', 'Special',
'Preferred Foot', 'Weak Foot', 'Skill Moves','International Reputation', 'Work Rate', 'Body Type',
'Position', 'Height', 'Weight', 'Likes', 'Dislikes', 'Following', 'Crossing', 'Finishing',
'Heading Accuracy', 'Short Passing', 'Volleys', 'Dribbling', 'Curve','FK Accuracy', 'Long Passing',
'Ball Control', 'Acceleration','Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Shot Power',
'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Aggression','Interceptions', 'Positioning',
'Vision', 'Penalties', 'Composure', 'Standing Tackle', 'Sliding Tackle',
'GK Diving','GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']
# Incorporating those features in the dataframe
df = data[features]
df.shape
(18541, 54)
# Checking the column null values
df.isnull().sum()
Name 0 Age 0 Nationality 0 Overall 0 Potential 0 Club 371 Value 0 Wage 0 Special 0 Preferred Foot 0 Weak Foot 0 Skill Moves 0 International Reputation 0 Work Rate 0 Body Type 51 Position 25 Height 0 Weight 0 Likes 0 Dislikes 0 Following 0 Crossing 153 Finishing 209 Heading Accuracy 133 Short Passing 220 Volleys 163 Dribbling 183 Curve 120 FK Accuracy 44 Long Passing 138 Ball Control 214 Acceleration 221 Sprint Speed 244 Agility 226 Reactions 210 Balance 177 Shot Power 116 Jumping 204 Stamina 257 Strength 198 Long Shots 154 Aggression 95 Interceptions 150 Positioning 166 Vision 225 Penalties 36 Composure 652 Standing Tackle 130 Sliding Tackle 171 GK Diving 30 GK Handling 32 GK Kicking 32 GK Positioning 34 GK Reflexes 36 dtype: int64
# Histogram to see the distribution of the data
df.Volleys.hist()
<AxesSubplot:>
# Skew function to see how the data is skewed
print(df['Volleys'].skew())
df['Volleys'].describe()
-0.23510340268058694
count 18378.000000 mean 44.687997 std 17.746712 min 4.000000 25% 32.000000 50% 46.000000 75% 58.000000 max 90.000000 Name: Volleys, dtype: float64
# Skew function to see how the data is skewed
print(df['Balance'].skew())
df['Balance'].describe()
-0.6195177715778473
count 18364.000000 mean 64.528098 std 14.040552 min 17.000000 25% 57.000000 50% 67.000000 75% 74.000000 max 97.000000 Name: Balance, dtype: float64
# Inter Quartile Range check
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Age 8.0 Overall 9.0 Potential 8.0 Value 1850000.0 Wage 9000.0 Special 319.0 Likes 6.0 Dislikes 1.0 Following 33.0 Crossing 25.0 Finishing 31.0 Heading Accuracy 20.0 Short Passing 14.0 Volleys 26.0 Dribbling 17.0 Curve 26.0 FK Accuracy 26.0 Long Passing 20.0 Ball Control 14.0 Acceleration 17.0 Sprint Speed 17.0 Agility 18.0 Reactions 12.0 Balance 17.0 Shot Power 20.0 Jumping 15.0 Stamina 17.0 Strength 16.0 Long Shots 29.0 Aggression 25.0 Interceptions 38.5 Positioning 24.0 Vision 19.0 Penalties 21.0 Composure 16.0 Standing Tackle 38.0 Sliding Tackle 39.0 GK Diving 6.0 GK Handling 6.0 GK Kicking 6.0 GK Positioning 6.0 GK Reflexes 6.0 dtype: float64
df['Volleys'].replace({np.NaN:data['Volleys'].mean()},inplace=True)
df['Curve'].replace({np.NaN:data['Curve'].mean()},inplace=True)
df['Agility'].replace({np.NaN:data['Agility'].mean()},inplace=True)
df['Balance'].replace({np.NaN:data['Balance'].mean()},inplace=True)
df['Jumping'].replace({np.NaN:data['Jumping'].mean()},inplace=True)
df['Interceptions'].replace({np.NaN:data['Interceptions'].mean()},inplace=True)
df['Positioning'].replace({np.NaN:data['Positioning'].mean()},inplace=True)
df['Composure'].replace({np.NaN:data['Composure'].mean()},inplace=True)
df['Sliding Tackle'].replace({np.NaN:data['Sliding Tackle'].mean()},inplace=True)
df['Crossing'].replace({np.NaN:data['Crossing'].mean()},inplace=True)
df['Finishing'].replace({np.NaN:data['Finishing'].mean()},inplace=True)
df['Heading Accuracy'].replace({np.NaN:data['Heading Accuracy'].mean()},inplace=True)
df['Short Passing'].replace({np.NaN:data['Short Passing'].mean()},inplace=True)
df['Dribbling'].replace({np.NaN:data['Dribbling'].mean()},inplace=True)
df['FK Accuracy'].replace({np.NaN:data['FK Accuracy'].mean()},inplace=True)
df['Long Passing'].replace({np.NaN:data['Long Passing'].mean()},inplace=True)
df['Ball Control'].replace({np.NaN:data['Ball Control'].mean()},inplace=True)
df['Acceleration'].replace({np.NaN:data['Acceleration'].mean()},inplace=True)
df['Sprint Speed'].replace({np.NaN:data['Sprint Speed'].mean()},inplace=True)
df['Reactions'].replace({np.NaN:data['Reactions'].mean()},inplace=True)
df['Shot Power'].replace({np.NaN:data['Shot Power'].mean()},inplace=True)
df['Stamina'].replace({np.NaN:data['Stamina'].mean()},inplace=True)
df['Strength'].replace({np.NaN:data['Strength'].mean()},inplace=True)
df['Long Shots'].replace({np.NaN:data['Long Shots'].mean()},inplace=True)
df['Aggression'].replace({np.NaN:data['Aggression'].mean()},inplace=True)
df['Vision'].replace({np.NaN:data['Vision'].mean()},inplace=True)
df['Penalties'].replace({np.NaN:data['Penalties'].mean()},inplace=True)
df['Standing Tackle'].replace({np.NaN:data['Standing Tackle'].mean()},inplace=True)
df['GK Diving'].replace({np.NaN:data['GK Diving'].mean()},inplace=True)
df['GK Handling'].replace({np.NaN:data['GK Handling'].mean()},inplace=True)
df['GK Kicking'].replace({np.NaN:data['GK Kicking'].mean()},inplace=True)
df['GK Positioning'].replace({np.NaN:data['GK Positioning'].mean()},inplace=True)
df['GK Reflexes'].replace({np.NaN:data['GK Reflexes'].mean()},inplace=True)
C:\Users\91989\anaconda\lib\site-packages\pandas\core\generic.py:6619: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return self._update_inplace(result)
nat_cnt=df.groupby('Nationality').apply(lambda x:x['Name'].count()).reset_index(name='Counts')
nat_cnt.sort_values(by='Counts',ascending=False,inplace=True)
top_20_nat_cnt=nat_cnt[:20]
fig=px.bar(top_20_nat_cnt,x='Nationality',y='Counts',color='Counts',title='Nationwise Representation in the FIFA Game')
fig.show()
Observations England has the highest number of players in FIFA 21 game. One of the major reasons in this regards is due to the EA franchise, which has predominatingly most user base in UK. Also in FIFA, English League has the most number of teams- generating the most number of players
cnt_best_avg=df.groupby('Nationality').apply(lambda x:np.average(x['Overall'])).reset_index(name='Overall Ratings')
cnt_best_cnt=df.groupby('Nationality').apply(lambda x:x['Overall'].count()).reset_index(name='Player Counts')
snt_best_avg_cnt=pd.merge(cnt_best_avg,cnt_best_cnt,how='inner',left_on='Nationality',right_on='Nationality')
sel_best_avg_cnt=snt_best_avg_cnt[snt_best_avg_cnt['Player Counts']>=200]
sel_best_avg_cnt.sort_values(by=['Overall Ratings','Player Counts'],ascending=[False,False])
px.scatter(sel_best_avg_cnt,x='Overall Ratings',y='Player Counts',color='Player Counts',size='Overall Ratings',hover_data=['Nationality'],title='Nationwise Player counts and Average Potential')
England and Brazil are the teams that deserve a mention in this aspect. England since it has produced 1856 players, and still is having an average of 63.28, while Brazil has the highest average Ratings among the players
clb_cnt=df.groupby('Club').apply(lambda x:x['Name'].count()).reset_index(name='Counts')
clb_cnt.sort_values(by='Counts',ascending=False,inplace=True)
fig=px.scatter(clb_cnt,x='Club',y='Counts',color='Counts',title='Clubwise Player counts in FIFA 21')
fig.show()
Bolton Wanderers with 48 players, Chelsea, Manchester United and AS Monaco have 45 players information for all the 3 clubs. Just a general observation, the average count of players for the English Premier League is more than any other leage. This shows the prioritization of English football by FIFA
cnt_best_avg=df.groupby('Club').apply(lambda x:np.average(x['Overall'])).reset_index(name='Overall Ratings')
cnt_best_cnt=df.groupby('Club').apply(lambda x:x['Overall'].count()).reset_index(name='Player Counts')
snt_best_avg_cnt=pd.merge(cnt_best_avg,cnt_best_cnt,how='inner',left_on='Club',right_on='Club')
sel_best_avg_cnt=snt_best_avg_cnt[snt_best_avg_cnt['Player Counts']>=25]
sel_best_avg_cnt.sort_values(by=['Overall Ratings','Player Counts'],ascending=[False,False])
px.scatter(sel_best_avg_cnt,x='Overall Ratings',y='Player Counts',color='Player Counts',size='Overall Ratings',hover_data=['Club'],title='Clubwise player counts and Average Potential')
As per the above chart, two teams deserve a special mention in this regard. Firstly it is Bayern Munich- The team which has the highest average rating among all the teams (81.46) from a set of 26 players. Another team is Real Madrid- which has the highest average among the teams with 45 players. They have an average of 79.06 on the 33 players
props=df[['Name','Nationality','Club','Height','Weight']]
props['Ht in ft']=pd.to_numeric(props['Height'].str[0])
props['Ht in in']=pd.to_numeric(props['Height'].str.split("\'").str[1].str.strip('"'))
props['Ht in cm']=(props['Ht in ft']*12+props['Ht in in'])*2.54
props['Weight in lb']=pd.to_numeric(props['Weight'].str.strip('lbs'))
fig=px.scatter(props,x='Weight in lb',y='Ht in cm',color='Ht in cm',size='Weight in lb',hover_data=['Name','Nationality','Club'],title='Height vs Weight Variation of the players in FIFA 21')
fig.show()
<ipython-input-17-327b0aaca2bd>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-17-327b0aaca2bd>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-17-327b0aaca2bd>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-17-327b0aaca2bd>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Generally for a healthy football player, the height and weight are in a proportion. Else he/she will be too weak/heavy- and not have the peak fitness form. This is seen from the above scatter plot.
pos_cnt=df.groupby('Position').apply(lambda x:x['Name'].count()).reset_index(name='Counts')
pos_cnt.sort_values(by='Counts',ascending=False,inplace=True)
top_20_pos_cnt=pos_cnt[:20]
fig=px.bar(top_20_pos_cnt,x='Position',y='Counts',color='Counts',title='Positionwise Player counts in FIFA 21')
fig.show()
# position abbreviation for reference
# 'Right Midfielder', 'Striker', 'Left Midfielder', 'Right Wing',
# 'Left Centre Midfielder', 'Right Safety', 'Running Back',
# 'Left Winger', 'Centre Midfielder', 'Cornerback',
# 'Centre Defensive Midfielder', 'Centre Attacking Midfielder',
# 'Linebacker', 'Right Attacking Midfielder',
# 'Right Centre Midfielder', 'Right Centre Back', 'Right Wing Back',
# 'Left Defensive Midfielder', 'Left Attacking Midfielder',
# 'Left Centre Back', 'Cente Forward', 'Long Snapper', 'Goalkeeper',
# 'Left Wing Back', 'Left Forward', 'Right Defensive Midfielder',
# 'Right Forward'
The most number of player population is for the Striker, which is followed by Center Back Position and The goal keeper positions.
age_cnt=df.groupby('Age').apply(lambda x:x['Name'].count()).reset_index(name='Counts')
fig=px.bar(age_cnt,x='Age',y='Counts',color='Counts',title='Agewise Player distribution in FIFA 21')
fig.show()
The chart looks like a normal curve which is left skewed. On an average 20-24 is the average age for most of the footballers.
top_play=df[['Name','Overall',"Age",'Club','Position']]
top_play.sort_values(by='Overall',ascending=False,inplace=True)
top_30_play=top_play[:100]
fig=px.scatter(top_30_play,x='Age',y='Overall',color='Age',size='Overall',hover_data=['Name','Club','Position'],title='Top Football Players in the FIFA 21 game')
fig.show()
C:\Users\91989\anaconda\lib\site-packages\pandas\util\_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Lionel Messi tops the board and then Cristiano Ronaldo. Among the youngest players, Kylian Mbappe, Jadon Snacho and Trent Alexander Arnold deserve a special mention.
cond_1=df['Overall']!=df['Potential']
cond_2=df['Age']<25
df_fil=df[cond_1 & cond_2]
potential_play=df_fil[['Name','Age','Nationality','Club','Potential','Position','Overall','Value']]
potential_play.sort_values(by='Potential',ascending=False,inplace=True)
top_potential_play=potential_play[:50]
fig=px.scatter(potential_play,x='Age',y='Potential',size='Potential',color='Age',hover_data=['Name','Age','Nationality','Position','Overall','Value'],title='Age vs Maximum Potential Distribution of the young Players')
fig.show()
The Potential cannot be equal to the Overall Score, and the age of the players must be smaller than 25
final_team=df_fil[['Name','Age','Potential','Position','Club']]
final_team.sort_values(by='Age',inplace=True)
Position_play=final_team.groupby('Position').apply(lambda x:np.max(x['Potential'])).reset_index(name='Potential')
player_pos=pd.merge(final_team,Position_play,how='inner',left_on=['Position','Potential'],right_on=['Position','Potential'])
Position_best=player_pos[['Name','Club','Age','Position','Potential']]
cm = sns.light_palette("black", as_cmap=True)
Position_best.style.background_gradient(cmap=cm).set_precision(2)
<ipython-input-22-6d068811c047>:7: FutureWarning: this method is deprecated in favour of `Styler.format(precision=..)`
| Name | Club | Age | Position | Potential | |
|---|---|---|---|---|---|
| 0 | Florian Wirtz | Bayer 04 Leverkusen | 17 | RCM | 88 |
| 1 | Isak Jansson | Kalmar FF | 18 | RF | 73 |
| 2 | Nuno Alexandre Tavares Mendes | Sporting CP | 18 | LWB | 87 |
| 3 | Reinier Jesus Carvalho | Borussia Dortmund | 18 | CF | 87 |
| 4 | Oliver Skipp | Norwich City | 19 | RDM | 84 |
| 5 | Jeremie Frimpong | Celtic | 19 | RWB | 86 |
| 6 | Thiago Almada | Vélez Sarsfield | 19 | LCM | 89 |
| 7 | Houssem Aouar | Olympique Lyonnais | 22 | LCM | 89 |
| 8 | Alphonso Davies | FC Bayern München | 19 | LB | 89 |
| 9 | Sergio Reguilón Rodríguez | Tottenham Hotspur | 23 | LB | 89 |
| 10 | Dominik Szoboszlai | FC Red Bull Salzburg | 19 | LAM | 87 |
| 11 | Vinícius José de Oliveira Júnior | Real Madrid | 19 | LW | 93 |
| 12 | Matthijs de Ligt | Juventus | 20 | CB | 92 |
| 13 | João Félix Sequeira | Atlético Madrid | 20 | RS | 93 |
| 14 | Jadon Sancho | Borussia Dortmund | 20 | RW | 93 |
| 15 | Antony Matheus dos Santos | Ajax | 20 | RAM | 88 |
| 16 | Sandro Tonali | Milan | 20 | CDM | 91 |
| 17 | Nils Fröling | Kalmar FF | 20 | LF | 79 |
| 18 | Patson Daka | FC Red Bull Salzburg | 21 | LS | 85 |
| 19 | Maximiliano Gómez | Valencia CF | 23 | LS | 85 |
| 20 | Moussa Dembélé | Olympique Lyonnais | 23 | LS | 85 |
| 21 | Emerson Leite De Souza | Real Betis | 21 | RB | 88 |
| 22 | Aaron Wan-Bissaka | Manchester United | 22 | RB | 88 |
| 23 | Kai Havertz | Chelsea | 21 | CAM | 93 |
| 24 | Edmond Tapsoba | Bayer 04 Leverkusen | 21 | RCB | 88 |
| 25 | Joe Gomez | Liverpool | 23 | RCB | 88 |
| 26 | Kylian Mbappé | Paris Saint-Germain | 21 | RM | 95 |
| 27 | Federico Valverde | Real Madrid | 21 | CM | 90 |
| 28 | Gianluigi Donnarumma | Milan | 21 | GK | 92 |
| 29 | Lautaro Martínez | Inter | 22 | ST | 91 |
| 30 | Marcus Rashford | Manchester United | 22 | LM | 91 |
| 31 | Frenkie de Jong | FC Barcelona | 23 | LDM | 90 |
| 32 | Niklas Süle | FC Bayern München | 24 | LCB | 89 |
india_team_all_players=df[df['Nationality']=='India']
india_team_2=india_team_all_players[['Name','Age','Overall','Position']]
india_team_2.sort_values(by='Position',inplace=True)
age_play=india_team_2.groupby('Position').apply(lambda x:np.max(x['Overall'])).reset_index(name='Overall')
player_pos=pd.merge(india_team_2,age_play,how='inner',left_on=['Position','Overall'],right_on=['Position','Overall'])
Position_best=player_pos[['Name','Age','Position','Overall']]
cm = sns.light_palette("cyan", as_cmap=True)
Position_best.style.background_gradient(cmap=cm).set_precision(2)
<ipython-input-23-63178736c7f9>:8: FutureWarning: this method is deprecated in favour of `Styler.format(precision=..)`
| Name | Age | Position | Overall | |
|---|---|---|---|---|
| 0 | Remil Nadkarni | 34 | CAM | 59 |
| 1 | Anuvinda Khurana | 27 | CB | 59 |
| 2 | Bismeet Sidhu | 32 | CDM | 60 |
| 3 | Gajodara Chatterjee | 34 | GK | 64 |
| 4 | Adit Ginti | 26 | LB | 61 |
| 5 | Abhimoda Chakraborty | 34 | LCB | 61 |
| 6 | Chapal Palan | 29 | LCM | 57 |
| 7 | Attana Deshpande | 39 | LM | 60 |
| 8 | Halicharan Narzary | 24 | LM | 60 |
| 9 | Sunil Chhetri | 33 | LS | 67 |
| 10 | Lalrindika Ralte | 25 | LW | 61 |
| 11 | Bhadrashree Raj | 32 | RB | 64 |
| 12 | Tapish Atwal | 36 | RCB | 59 |
| 13 | Anvit Swaminathan | 28 | RCM | 61 |
| 14 | Hantidev Bhandari | 31 | RM | 62 |
| 15 | Jeje Lalpekhlua | 27 | ST | 63 |
Representation of players with their age, position and potential
Correlation plot
corr_ = df.corr()
f,ax = plt.subplots(figsize=(25, 10))
sns.heatmap(corr_,annot=True, linewidths=0.5, cmap="YlGnBu", fmt= '.1f',ax=ax)
plt.show()
# Dropping records not having value greater than zero to avoid null values
df_new = df[(df['Value'] > 0)]
# function for transforming each positions to particular main position into 4 major positions as forward, midfielder, defender and goalkeeper
def complex_function(vc):
if vc in ['ST','RW','RS','LW','CF','LS','LF','RF']:
return 'Forward'
elif vc in ['RM','LM','LCM','CM','CAM','RAM','RCM','LDM','CDM','RDM','LAM']:
return 'Midfielder'
elif vc in ['RB','CB','LB','RCB','RWB','LCB','LWB']:
return 'Defender'
else:
return 'Goalkeeper'
# Applying function to create new field based on certain transformation
df_new['Grouped_Position'] = df_new['Position'].apply(complex_function)
<ipython-input-17-49d2c1f136c3>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Adding the new transformed position field as Y df and dropped position field and made that as x df
x = df_new.drop(['Position','Grouped_Position'], axis = 1)
y = df_new['Grouped_Position']
print("Shape of x :", x.shape)
print("Shape of y :", y.shape)
Shape of x : (18033, 53) Shape of y : (18033,)
feature_name = list(x.columns)
# no of maximum features we need to select
num_feats=30
Label Encoding
# Label Encoding to convert the features to model understandable format
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in range(0,x.shape[1]):
if x.dtypes[i]=='object':
x[x.columns[i]] = le.fit_transform(x[x.columns[i]])
print(x)
Name Age Nationality Overall Potential Club Value \
0 5283 18 162 71 87 598 4900000.0
1 4574 32 162 86 86 588 35500000.0
2 6299 17 161 68 87 116 1800000.0
3 14368 23 21 81 85 755 23000000.0
4 7594 28 34 82 82 280 22500000.0
... ... ... ... ... ... ... ...
18535 14662 39 21 75 75 767 375000.0
18536 15187 37 57 77 77 754 600000.0
18537 15851 34 49 68 68 548 425000.0
18539 16275 32 23 76 76 75 2200000.0
18540 16303 35 149 67 67 389 110000.0
Wage Special Preferred Foot ... Vision Penalties \
0 500.0 1729 1 ... 63.000000 66.0
1 150000.0 2144 1 ... 75.000000 85.0
2 2000.0 1671 1 ... 65.000000 50.0
3 50000.0 2071 0 ... 75.000000 73.0
4 105000.0 2099 0 ... 83.000000 81.0
... ... ... ... ... ... ...
18535 10000.0 1730 0 ... 55.119404 83.0
18536 15000.0 1086 1 ... 31.000000 23.0
18537 6000.0 1744 1 ... 68.000000 64.0
18539 20000.0 1920 1 ... 76.000000 67.0
18540 6000.0 1413 0 ... 40.000000 51.0
Composure Standing Tackle Sliding Tackle GK Diving GK Handling \
0 61.000000 11.0 18.0 9.0 12.0
1 80.000000 48.0 39.0 12.0 5.0
2 59.000000 23.0 24.0 10.0 13.0
3 79.000000 54.0 38.0 8.0 7.0
4 87.000000 41.0 44.0 15.0 15.0
... ... ... ... ... ...
18535 59.603052 23.0 24.0 11.0 9.0
18536 20.000000 11.0 11.0 76.0 76.0
18537 70.000000 27.0 33.0 14.0 9.0
18539 59.603052 62.0 70.0 9.0 9.0
18540 59.603052 65.0 69.0 13.0 15.0
GK Kicking GK Positioning GK Reflexes
0 7.0 8.0 7.0
1 13.0 13.0 10.0
2 14.0 12.0 7.0
3 13.0 8.0 14.0
4 15.0 5.0 14.0
... ... ... ...
18535 14.0 10.0 11.0
18536 77.0 77.0 77.0
18537 14.0 11.0 12.0
18539 7.0 9.0 15.0
18540 9.0 5.0 5.0
[18033 rows x 53 columns]
Feature selection using selectkBest technique
# converted the features to given range and selected the top 30 features
x_norm = MinMaxScaler().fit_transform(x)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(x_norm, y)
chi_support = chi_selector.get_support()
chi_feature = x.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')
30 selected features
# Displaying the top 30 features
chi_feature
['Special', 'Skill Moves', 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control', 'Acceleration', 'Sprint Speed', 'Agility', 'Balance', 'Stamina', 'Long Shots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Standing Tackle', 'Sliding Tackle', 'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']
y = df_new['Grouped_Position']
x = df_new.drop(['Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value', 'Wage', 'Position', 'Grouped_Position', 'Weak Foot', 'International Reputation', 'Work Rate', 'Body Type', 'Weight', 'Likes', 'Dislikes', 'Following', 'Reactions', 'Balance', 'Shot Power', 'Jumping', 'Strength', 'Vision', 'Composure'], axis = 1)
print("Shape of x :", x.shape)
print("Shape of y :", y.shape)
Shape of x : (18033, 31) Shape of y : (18033,)
# Label Encoding for final dataframe
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in range(0,x.shape[1]):
if x.dtypes[i]=='object':
x[x.columns[i]] = le.fit_transform(x[x.columns[i]])
print(x)
Name Special Preferred Foot Skill Moves Height Crossing \
0 5283 1729 1 3 10 67.000000
1 4574 2144 1 3 12 68.000000
2 6299 1671 1 6 12 54.000000
3 14368 2071 0 6 10 76.000000
4 7594 2099 0 6 2 90.000000
... ... ... ... ... ... ...
18535 14662 1730 0 8 12 51.262508
18536 15187 1086 1 0 12 10.000000
18537 15851 1744 1 3 6 69.000000
18539 16275 1920 1 3 2 61.000000
18540 16303 1413 0 1 12 44.000000
Finishing Heading Accuracy Short Passing Volleys ... \
0 60.00000 50.0 67.00000 56.000000 ...
1 87.00000 89.0 75.00000 88.000000 ...
2 59.00000 42.0 73.00000 59.000000 ...
3 72.00000 45.0 74.00000 72.000000 ...
4 84.00000 62.0 83.00000 90.000000 ...
... ... ... ... ... ...
18535 47.51451 64.0 60.13258 44.687997 ...
18536 10.00000 10.0 24.00000 10.000000 ...
18537 61.00000 36.0 65.00000 70.000000 ...
18539 65.00000 70.0 83.00000 73.000000 ...
18540 21.00000 75.0 55.00000 23.000000 ...
Interceptions Positioning Penalties Standing Tackle Sliding Tackle \
0 31.0 65.000000 66.0 11.0 18.0
1 54.0 92.000000 85.0 48.0 39.0
2 31.0 56.000000 50.0 23.0 24.0
3 55.0 81.000000 73.0 54.0 38.0
4 55.0 80.000000 81.0 41.0 44.0
... ... ... ... ... ...
18535 21.0 51.943673 83.0 23.0 24.0
18536 27.0 16.000000 23.0 11.0 11.0
18537 39.0 66.000000 64.0 27.0 33.0
18539 82.0 65.000000 67.0 62.0 70.0
18540 65.0 39.000000 51.0 65.0 69.0
GK Diving GK Handling GK Kicking GK Positioning GK Reflexes
0 9.0 12.0 7.0 8.0 7.0
1 12.0 5.0 13.0 13.0 10.0
2 10.0 13.0 14.0 12.0 7.0
3 8.0 7.0 13.0 8.0 14.0
4 15.0 15.0 15.0 5.0 14.0
... ... ... ... ... ...
18535 11.0 9.0 14.0 10.0 11.0
18536 76.0 76.0 77.0 77.0 77.0
18537 14.0 9.0 14.0 11.0 12.0
18539 9.0 9.0 7.0 9.0 15.0
18540 13.0 15.0 9.0 5.0 5.0
[18033 rows x 31 columns]
Splitting the dataframe for train and test validation approach
# using the train test split validation approach
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
print("Shape of X_train :", X_train.shape)
print("Shape of X_test :", X_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)
Shape of X_train : (12623, 31) Shape of X_test : (5410, 31) Shape of y_train : (12623,) Shape of y_test : (5410,)
# the counts of the target variable to check the data imbalance
print('Classes and number of values in trainset',Counter(y_train))
Classes and number of values in trainset Counter({'Midfielder': 4743, 'Defender': 4022, 'Forward': 2663, 'Goalkeeper': 1195})
# Adaptive Synthetic Sampling Approach used for saampling the data
oversample = ADASYN(sampling_strategy='minority')
X_train,y_train = oversample.fit_resample(X_train,y_train)
print('Classes and number of values in trainset after ADSYN:',Counter(y_train))
Classes and number of values in trainset after ADSYN: Counter({'Midfielder': 4743, 'Goalkeeper': 4739, 'Defender': 4022, 'Forward': 2663})
# Standard Scaling technique applied to the train and test dataframe
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Model Building using 4 different Ensemble Machine Learning Models like Support Vector Classifier, Logisitic Regressor Classifier, Random Forest Classifier, and Decision Tree Classifier which works on bagging and boosting approach and will obviously helps in improving the model performance when compared to all the other traditional models
SVC Base Model
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
svc = SVC( kernel= 'rbf', gamma= 0.1, C= 9)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# print('Training Accuracy: %1.3f.' % svc.score(X_train, y_train))
#10-fold cross validation score
cv = cross_val_score(estimator = svc, X = X_train, y = y_train, cv =3)
SVC_cv = cv.mean()
print("10 fold cross validation :", SVC_cv)
from sklearn.metrics import accuracy_score
SVC_ac = accuracy_score(y_pred,y_test)
print("accuracy :", SVC_ac)
from sklearn.metrics import precision_score
SVC_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", SVC_p)
from sklearn.metrics import recall_score
SVC_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", SVC_r)
from sklearn.metrics import f1_score
SVC_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", SVC_f1)
10 fold cross validation : 0.8909507020473805 accuracy : 0.8604436229205176 precision : 0.8610622371180854 recall : 0.8604436229205176 F1 Score : 0.8602794204310344
Hyperparameter Tuning of the SVC Model
svc_params = {'C': range(1, 10, 1), 'gamma': np.arange(0.1, 1, 0.1), 'kernel': ['rbf', 'linear']}
random_search = RandomizedSearchCV(estimator = svc, param_distributions = svc_params, n_iter = 20, cv = 5, verbose=2, random_state=42, n_jobs = -1)
random_search.fit(X_train, y_train)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV(cv=5, estimator=SVC(C=9, gamma=0.1), n_iter=20, n_jobs=-1,
param_distributions={'C': range(1, 10),
'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
'kernel': ['rbf', 'linear']},
random_state=42, verbose=2)
Printing the Best Hyperparameter values
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
All results:
{'mean_fit_time': array([68.2390811 , 15.48659596, 19.58205657, 13.26078887, 27.30172582,
10.9647934 , 15.49956112, 11.71959081, 49.56279974, 11.19839506,
10.26906362, 60.39004378, 8.69066701, 61.01321259, 38.1889441 ,
61.68121591, 18.96536489, 69.70167737, 46.55615177, 67.94903998]), 'std_fit_time': array([2.16893897, 2.08876823, 0.81166038, 0.75011596, 0.98680485,
0.25964075, 0.50468244, 0.11052728, 1.95869859, 0.68172382,
0.31046006, 4.39795038, 0.11590589, 3.70234192, 1.048439 ,
4.44608852, 0.56081856, 2.10189924, 2.0910459 , 1.01146143]), 'mean_score_time': array([15.12459397, 1.31876559, 1.39826078, 1.30595975, 9.20751724,
1.23598275, 1.332445 , 1.23233213, 13.77409987, 5.77425814,
1.36029897, 15.14016943, 1.34795499, 15.18642173, 11.90248518,
15.37271132, 1.35440249, 15.76266313, 13.31543345, 14.29841886]), 'std_score_time': array([0.07841032, 0.1367208 , 0.0847233 , 0.14839956, 0.58882183,
0.04765614, 0.0905552 , 0.10572927, 0.4286374 , 0.6298421 ,
0.10532052, 0.89086126, 0.03448069, 0.37851863, 0.14915417,
0.52726209, 0.0773279 , 0.71106932, 1.44082216, 0.74783377]), 'param_kernel': masked_array(data=['rbf', 'linear', 'linear', 'linear', 'rbf', 'linear',
'linear', 'linear', 'rbf', 'rbf', 'linear', 'rbf',
'linear', 'rbf', 'rbf', 'rbf', 'linear', 'rbf', 'rbf',
'rbf'],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object), 'param_gamma': masked_array(data=[0.8, 0.1, 0.30000000000000004, 0.1,
0.30000000000000004, 0.6, 0.6, 0.8, 0.6, 0.1, 0.1,
0.7000000000000001, 0.8, 0.7000000000000001, 0.4,
0.7000000000000001, 0.9, 0.8, 0.5, 0.9],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object), 'param_C': masked_array(data=[9, 7, 8, 4, 6, 2, 6, 3, 6, 9, 2, 5, 1, 4, 2, 2, 8, 6,
6, 1],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object), 'params': [{'kernel': 'rbf', 'gamma': 0.8, 'C': 9}, {'kernel': 'linear', 'gamma': 0.1, 'C': 7}, {'kernel': 'linear', 'gamma': 0.30000000000000004, 'C': 8}, {'kernel': 'linear', 'gamma': 0.1, 'C': 4}, {'kernel': 'rbf', 'gamma': 0.30000000000000004, 'C': 6}, {'kernel': 'linear', 'gamma': 0.6, 'C': 2}, {'kernel': 'linear', 'gamma': 0.6, 'C': 6}, {'kernel': 'linear', 'gamma': 0.8, 'C': 3}, {'kernel': 'rbf', 'gamma': 0.6, 'C': 6}, {'kernel': 'rbf', 'gamma': 0.1, 'C': 9}, {'kernel': 'linear', 'gamma': 0.1, 'C': 2}, {'kernel': 'rbf', 'gamma': 0.7000000000000001, 'C': 5}, {'kernel': 'linear', 'gamma': 0.8, 'C': 1}, {'kernel': 'rbf', 'gamma': 0.7000000000000001, 'C': 4}, {'kernel': 'rbf', 'gamma': 0.4, 'C': 2}, {'kernel': 'rbf', 'gamma': 0.7000000000000001, 'C': 2}, {'kernel': 'linear', 'gamma': 0.9, 'C': 8}, {'kernel': 'rbf', 'gamma': 0.8, 'C': 6}, {'kernel': 'rbf', 'gamma': 0.5, 'C': 6}, {'kernel': 'rbf', 'gamma': 0.9, 'C': 1}], 'split0_test_score': array([0.76530612, 0.90445269, 0.90445269, 0.90414348, 0.88033395,
0.9047619 , 0.90414348, 0.9047619 , 0.82653061, 0.89888683,
0.9047619 , 0.79746444, 0.9047619 , 0.79746444, 0.8729128 ,
0.79931973, 0.90445269, 0.76530612, 0.84662956, 0.70717378]), 'split1_test_score': array([0.80952381, 0.89424861, 0.89393939, 0.89332096, 0.88435374,
0.89332096, 0.89424861, 0.89332096, 0.85312307, 0.89703154,
0.89332096, 0.82869511, 0.89393939, 0.82869511, 0.88651824,
0.82807669, 0.89393939, 0.80952381, 0.87167594, 0.77489177]), 'split2_test_score': array([0.81596041, 0.89266935, 0.89266935, 0.89266935, 0.88400866,
0.89266935, 0.89266935, 0.89266935, 0.85988246, 0.89359728,
0.89266935, 0.83884937, 0.89266935, 0.83884937, 0.88277142,
0.84008661, 0.89266935, 0.81596041, 0.87349211, 0.76616146]), 'split3_test_score': array([0.83606557, 0.89514383, 0.89452521, 0.89514383, 0.88277142,
0.89514383, 0.89514383, 0.89483452, 0.86792453, 0.89174142,
0.89514383, 0.85307764, 0.89483452, 0.85307764, 0.88864831,
0.85276833, 0.89452521, 0.83606557, 0.87967832, 0.79616455]), 'split4_test_score': array([0.8004949 , 0.90504176, 0.90504176, 0.90504176, 0.88617383,
0.90473245, 0.90504176, 0.90504176, 0.8580266 , 0.89607176,
0.90473245, 0.83235385, 0.90504176, 0.83235385, 0.888339 ,
0.83266316, 0.90504176, 0.8004949 , 0.87349211, 0.75409836]), 'mean_test_score': array([0.80547016, 0.89831125, 0.89812568, 0.89806387, 0.88352832,
0.8981257 , 0.8982494 , 0.8981257 , 0.85309745, 0.89546576,
0.8981257 , 0.83008808, 0.89824938, 0.83008808, 0.88383796,
0.8305829 , 0.89812568, 0.80547016, 0.86899361, 0.75969799]), 'std_test_score': array([0.02323988, 0.00531763, 0.00544285, 0.00539954, 0.00193327,
0.00546694, 0.00524717, 0.00557782, 0.01411363, 0.00252525,
0.00546694, 0.01831205, 0.00547583, 0.01831205, 0.00584943,
0.01772291, 0.00544285, 0.02323988, 0.01150636, 0.02963979]), 'rank_test_score': array([18, 1, 7, 9, 12, 4, 2, 4, 14, 10, 4, 16, 3, 16, 11, 15, 7,
18, 13, 20])}
Best estimator:
SVC(C=7, gamma=0.1, kernel='linear')
Best hyperparameters:
{'kernel': 'linear', 'gamma': 0.1, 'C': 7}
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
tuned_svc = SVC( kernel= 'linear', gamma= 0.1, C= 7)
tuned_svc.fit(X_train, y_train)
y_pred = tuned_svc.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = svc, X = X_train, y = y_train, cv =3)
tuned_SVC_cv = cv.mean()
print("10 fold cross validation :", SVC_cv)
from sklearn.metrics import accuracy_score
tuned_SVC_ac = accuracy_score(y_pred,y_test)
print("accuracy :", SVC_ac)
from sklearn.metrics import precision_score
tuned_SVC_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", SVC_p)
from sklearn.metrics import recall_score
tuned_SVC_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", SVC_r)
from sklearn.metrics import f1_score
tuned_SVC_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", SVC_f1)
10 fold cross validation : 0.8909507020473805 accuracy : 0.8604436229205176 precision : 0.8610622371180854 recall : 0.8604436229205176 F1 Score : 0.8602794204310344
Logisitic Regression Classifier Base Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(penalty = 'l2', C = 0.01 )
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = lr, X = X_train, y = y_train, cv =3)
log_cv = cv.mean()
print("10 fold cross validation :", log_cv)
from sklearn.metrics import accuracy_score
log_ac = accuracy_score(y_pred,y_test)
print("accuracy :", log_ac)
from sklearn.metrics import precision_score
log_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", log_p)
from sklearn.metrics import recall_score
log_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", log_r)
from sklearn.metrics import f1_score
log_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", log_f1)
10 fold cross validation : 0.8915073916001731 accuracy : 0.865988909426987 precision : 0.8660867453574451 recall : 0.865988909426987 F1 Score : 0.8660971494408414
Hyperparameter Tuning of the Logistic Regression Classifier
#Parameters for Hyper Parameter Tuning
lr_param={"C":np.logspace(-3,3,7), "penalty":["l1","l2"],}# l1 lasso l2 ridge
param_comb = 100
#Hyper parameter Tuning of Logistic regression
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(lr, param_distributions=lr_param, n_iter=param_comb, scoring='accuracy', n_jobs= -1, verbose=3, random_state=10 )
random_search.fit(X_train, y_train)
C:\Users\91989\anaconda\lib\site-packages\sklearn\model_selection\_search.py:292: UserWarning: The total space of parameters 14 is smaller than n_iter=100. Running 14 iterations. For exhaustive searches, use GridSearchCV.
Fitting 5 folds for each of 14 candidates, totalling 70 fits
C:\Users\91989\anaconda\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\91989\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\91989\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
File "C:\Users\91989\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
C:\Users\91989\anaconda\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning:
One or more of the test scores are non-finite: [ nan 0.87270336 nan 0.89212563 nan 0.89769242
nan 0.89818746 nan 0.89800201 nan 0.89794013
nan 0.89781641]
C:\Users\91989\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
RandomizedSearchCV(estimator=LogisticRegression(C=0.01), n_iter=100, n_jobs=-1,
param_distributions={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
'penalty': ['l1', 'l2']},
random_state=10, scoring='accuracy', verbose=3)
Printing the Best Hyperparameter values
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
All results:
{'mean_fit_time': array([0.00800533, 0.45098314, 0.01099806, 0.8821537 , 0.00872512,
1.12705445, 0.00943842, 1.1424911 , 0.00910869, 1.15644922,
0.0103354 , 1.10911641, 0.01041908, 0.91872277]), 'std_fit_time': array([0.00108684, 0.02280475, 0.00316146, 0.04443592, 0.00162511,
0.04627641, 0.00098607, 0.04112939, 0.00092771, 0.03340425,
0.00096607, 0.05986003, 0.00104655, 0.07881173]), 'mean_score_time': array([0. , 0.00541573, 0. , 0.00510416, 0. ,
0.00580802, 0. , 0.00542779, 0. , 0.00611739,
0. , 0.00633936, 0. , 0.00379386]), 'std_score_time': array([0. , 0.00037446, 0. , 0.00052716, 0. ,
0.00097798, 0. , 0.00050467, 0. , 0.0003661 ,
0. , 0.00093869, 0. , 0.00074583]), 'param_penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1',
'l2', 'l1', 'l2', 'l1', 'l2'],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1.0, 1.0, 10.0,
10.0, 100.0, 100.0, 1000.0, 1000.0],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object), 'params': [{'penalty': 'l1', 'C': 0.001}, {'penalty': 'l2', 'C': 0.001}, {'penalty': 'l1', 'C': 0.01}, {'penalty': 'l2', 'C': 0.01}, {'penalty': 'l1', 'C': 0.1}, {'penalty': 'l2', 'C': 0.1}, {'penalty': 'l1', 'C': 1.0}, {'penalty': 'l2', 'C': 1.0}, {'penalty': 'l1', 'C': 10.0}, {'penalty': 'l2', 'C': 10.0}, {'penalty': 'l1', 'C': 100.0}, {'penalty': 'l2', 'C': 100.0}, {'penalty': 'l1', 'C': 1000.0}, {'penalty': 'l2', 'C': 1000.0}], 'split0_test_score': array([ nan, 0.87940631, nan, 0.90074212, nan,
0.90290662, nan, 0.90352505, nan, 0.90136054,
nan, 0.90228819, nan, 0.90197897]), 'split1_test_score': array([ nan, 0.8707483 , nan, 0.88837353, nan,
0.89919604, nan, 0.8961039 , nan, 0.8961039 ,
nan, 0.89548547, nan, 0.89579468]), 'split2_test_score': array([ nan, 0.86019177, nan, 0.88277142, nan,
0.89050418, nan, 0.89236004, nan, 0.89297866,
nan, 0.89205073, nan, 0.89205073]), 'split3_test_score': array([ nan, 0.87596659, nan, 0.89205073, nan,
0.89328797, nan, 0.8942159 , nan, 0.89483452,
nan, 0.89514383, nan, 0.89483452]), 'split4_test_score': array([ nan, 0.87720384, nan, 0.89669038, nan,
0.90256727, nan, 0.90473245, nan, 0.90473245,
nan, 0.90473245, nan, 0.90442314]), 'mean_test_score': array([ nan, 0.87270336, nan, 0.89212563, nan,
0.89769242, nan, 0.89818746, nan, 0.89800201,
nan, 0.89794013, nan, 0.89781641]), 'std_test_score': array([ nan, 0.00687306, nan, 0.00627162, nan,
0.00498564, nan, 0.005008 , nan, 0.00436917,
nan, 0.00476594, nan, 0.00463029]), 'rank_test_score': array([ 8, 7, 9, 6, 10, 5, 11, 1, 12, 2, 13, 3, 14, 4])}
Best estimator:
LogisticRegression()
Best hyperparameters:
{'penalty': 'l2', 'C': 1.0}
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
tuned_lr = LogisticRegression(penalty = 'l2', C = 1.0 )
tuned_lr.fit(X_train, y_train)
y_pred = tuned_lr.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = lr, X = X_train, y = y_train, cv =3)
tuned_log_cv = cv.mean()
print("10 fold cross validation :", log_cv)
from sklearn.metrics import accuracy_score
tuned_log_ac = accuracy_score(y_pred,y_test)
print("accuracy :", log_ac)
from sklearn.metrics import precision_score
tuned_log_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", log_p)
from sklearn.metrics import recall_score
tuned_log_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", log_r)
from sklearn.metrics import f1_score
tuned_log_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", log_f1)
C:\Users\91989\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
10 fold cross validation : 0.8915073916001731 accuracy : 0.865988909426987 precision : 0.8660867453574451 recall : 0.865988909426987 F1 Score : 0.8660971494408414
Random Forest Classifier Base Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators = 1860, min_samples_split = 2, min_samples_leaf = 4, max_features='auto', max_depth = 70, bootstrap = 'True')
rf.fit(X_train, y_train)
# Predicting the Test set results
y_pred = rf.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = rf, X = X_train, y = y_train, cv =3)
rf_cv = cv.mean()
print("10 fold cross validation :", rf_cv)
from sklearn.metrics import accuracy_score
rf_ac = accuracy_score(y_pred,y_test)
print("accuracy :", rf_ac)
from sklearn.metrics import precision_score
rf_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", rf_p)
from sklearn.metrics import recall_score
rf_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", rf_r)
from sklearn.metrics import f1_score
rf_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", rf_f1)
10 fold cross validation : 0.901094822787159 accuracy : 0.8739371534195933 precision : 0.8749279444562226 recall : 0.8739371534195933 F1 Score : 0.8738406313696206
Hyperparameter Tuning of the Random Forest Classifier
from sklearn.model_selection import RandomizedSearchCV
max_features = ['auto', 'sqrt']
n_estimators = range(200, 2000, 10)
max_depth = range(10, 110, 10)
min_samples_split = range(2, 10, 1)
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
param_comb = 100
random_search = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)
random_search.fit(X_train, y_train)
Fitting 2 folds for each of 10 candidates, totalling 20 fits
RandomizedSearchCV(cv=2,
estimator=RandomForestClassifier(bootstrap='True',
max_depth=70,
min_samples_leaf=4,
n_estimators=1860),
n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': range(10, 110, 10),
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': range(2, 10),
'n_estimators': range(200, 2000, 10)},
random_state=42, verbose=2)
Printing the Best Hyperparameter Values
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
All results:
{'mean_fit_time': array([ 59.00887847, 94.96244752, 102.71881282, 15.7013253 ,
105.37056756, 69.40369427, 61.39910555, 105.86983871,
89.3273015 , 16.00332642]), 'std_fit_time': array([2.15685344, 1.72794354, 0.31142914, 0.34644377, 2.51075745,
1.27985775, 1.83904052, 3.47259951, 1.15950322, 0.18060207]), 'mean_score_time': array([4.63433254, 7.65199649, 9.01930237, 1.03971982, 9.75910258,
4.83634305, 5.018134 , 9.88080311, 8.24152803, 1.00658846]), 'std_score_time': array([0.08076584, 0.38120592, 0.48870373, 0.02419591, 0.25999403,
0.22495484, 0.00219953, 0.02264977, 0.58476591, 0.02843165]), 'param_n_estimators': masked_array(data=[1180, 1870, 1920, 340, 1990, 1280, 1860, 1970, 1660,
380],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_min_samples_split': masked_array(data=[7, 9, 6, 2, 3, 6, 2, 4, 8, 8],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_min_samples_leaf': masked_array(data=[1, 4, 2, 1, 4, 2, 4, 4, 4, 1],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_max_features': masked_array(data=['auto', 'sqrt', 'auto', 'auto', 'sqrt', 'sqrt', 'auto',
'sqrt', 'auto', 'auto'],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_max_depth': masked_array(data=[50, 70, 60, 30, 40, 30, 70, 60, 100, 10],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_bootstrap': masked_array(data=[False, False, False, False, False, False, True, False,
False, False],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'params': [{'n_estimators': 1180, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 50, 'bootstrap': False}, {'n_estimators': 1870, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False}, {'n_estimators': 1920, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 60, 'bootstrap': False}, {'n_estimators': 340, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}, {'n_estimators': 1990, 'min_samples_split': 3, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}, {'n_estimators': 1280, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}, {'n_estimators': 1860, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': True}, {'n_estimators': 1970, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}, {'n_estimators': 1660, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}, {'n_estimators': 380, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': False}], 'split0_test_score': array([0.89930727, 0.89807026, 0.89831766, 0.89955468, 0.89794656,
0.89831766, 0.89683325, 0.89819396, 0.89745176, 0.89398812]), 'split1_test_score': array([0.90337746, 0.90214029, 0.90387232, 0.90337746, 0.90300631,
0.90337746, 0.90201658, 0.90238773, 0.90313003, 0.89879995]), 'mean_test_score': array([0.90134237, 0.90010528, 0.90109499, 0.90146607, 0.90047644,
0.90084756, 0.89942491, 0.90029085, 0.90029089, 0.89639404]), 'std_test_score': array([0.00203509, 0.00203502, 0.00277733, 0.00191139, 0.00252987,
0.0025299 , 0.00259166, 0.00209688, 0.00283913, 0.00240591]), 'rank_test_score': array([ 2, 8, 3, 1, 5, 4, 9, 7, 6, 10])}
Best estimator:
RandomForestClassifier(bootstrap=False, max_depth=30, n_estimators=340)
Best hyperparameters:
{'n_estimators': 340, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}
Tuned Model for Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
tuned_rf = RandomForestClassifier(n_estimators = 340, min_samples_split = 2, min_samples_leaf = 1, max_features='auto', max_depth = 30, bootstrap = 'False')
tuned_rf.fit(X_train, y_train)
# Predicting the Test set results
y_pred = tuned_rf.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = rf, X = X_train, y = y_train, cv =3)
tuned_rf_cv = cv.mean()
print("10 fold cross validation :", rf_cv)
from sklearn.metrics import accuracy_score
tuned_rf_ac = accuracy_score(y_pred,y_test)
print("accuracy :", rf_ac)
from sklearn.metrics import precision_score
tuned_rf_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", rf_p)
from sklearn.metrics import recall_score
tuned_rf_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", rf_r)
from sklearn.metrics import f1_score
tuned_rf_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", rf_f1)
10 fold cross validation : 0.901094822787159 accuracy : 0.8739371534195933 precision : 0.8749279444562226 recall : 0.8739371534195933 F1 Score : 0.8738406313696206
Decision Tree Classifier Base Model
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
classifier = DecisionTreeClassifier(min_samples_leaf = 6, max_features=7, max_depth= None, criterion = 'gini')
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv =3)
dc_cv = cv.mean()
print("10 fold cross validation :", dc_cv)
from sklearn.metrics import accuracy_score
dc_ac = accuracy_score(y_pred,y_test)
print("accuracy :", dc_ac)
from sklearn.metrics import precision_score
dc_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", dc_p)
from sklearn.metrics import recall_score
dc_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", dc_r)
from sklearn.metrics import f1_score
dc_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", dc_f1)
10 fold cross validation : 0.8485804416403786 accuracy : 0.8083179297597043 precision : 0.8098093342521533 recall : 0.8083179297597043 F1 Score : 0.8077182156541395
Hyperparameter Tuning of the Decision Tree Classifier
param_dist = {"max_depth": [3, None],
"max_features": range(1, 9),
"min_samples_leaf": range(1, 9),
"criterion": ["gini", "entropy"]}
random_search = RandomizedSearchCV(estimator = classifier, param_distributions = param_dist, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
random_search.fit(X_train, y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomizedSearchCV(cv=5,
estimator=DecisionTreeClassifier(max_features=7,
min_samples_leaf=6),
n_jobs=-1,
param_distributions={'criterion': ['gini', 'entropy'],
'max_depth': [3, None],
'max_features': range(1, 9),
'min_samples_leaf': range(1, 9)},
random_state=42, verbose=2)
Printing the Best Hyperparameter Values
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
All results:
{'mean_fit_time': array([0.09387021, 0.03995581, 0.05808277, 0.06206493, 0.09689026,
0.06814876, 0.12480278, 0.06617355, 0.04326472, 0.04342818]), 'std_fit_time': array([0.00633166, 0.00222616, 0.00562742, 0.00916182, 0.00684135,
0.0154164 , 0.00382658, 0.00929117, 0.00565482, 0.00473721]), 'mean_score_time': array([0.00598259, 0.00598383, 0.00615144, 0.00601416, 0.00618849,
0.005024 , 0.00595036, 0.00544682, 0.00515532, 0.00457697]), 'std_score_time': array([1.25584364e-03, 8.98739154e-04, 4.80861821e-04, 6.38994529e-05,
9.83500753e-04, 4.16537562e-04, 5.33382905e-04, 4.90025403e-04,
2.11352120e-04, 9.33533948e-04]), 'param_min_samples_leaf': masked_array(data=[5, 7, 8, 7, 6, 2, 3, 8, 2, 7],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_max_features': masked_array(data=[5, 1, 2, 2, 7, 8, 7, 5, 2, 4],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_max_depth': masked_array(data=[None, 3, None, None, None, 3, None, 3, 3, 3],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_criterion': masked_array(data=['entropy', 'gini', 'gini', 'entropy', 'gini',
'entropy', 'entropy', 'entropy', 'gini', 'gini'],
mask=[False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'params': [{'min_samples_leaf': 5, 'max_features': 5, 'max_depth': None, 'criterion': 'entropy'}, {'min_samples_leaf': 7, 'max_features': 1, 'max_depth': 3, 'criterion': 'gini'}, {'min_samples_leaf': 8, 'max_features': 2, 'max_depth': None, 'criterion': 'gini'}, {'min_samples_leaf': 7, 'max_features': 2, 'max_depth': None, 'criterion': 'entropy'}, {'min_samples_leaf': 6, 'max_features': 7, 'max_depth': None, 'criterion': 'gini'}, {'min_samples_leaf': 2, 'max_features': 8, 'max_depth': 3, 'criterion': 'entropy'}, {'min_samples_leaf': 3, 'max_features': 7, 'max_depth': None, 'criterion': 'entropy'}, {'min_samples_leaf': 8, 'max_features': 5, 'max_depth': 3, 'criterion': 'entropy'}, {'min_samples_leaf': 2, 'max_features': 2, 'max_depth': 3, 'criterion': 'gini'}, {'min_samples_leaf': 7, 'max_features': 4, 'max_depth': 3, 'criterion': 'gini'}], 'split0_test_score': array([0.8475572 , 0.70655535, 0.82529375, 0.83147805, 0.84632035,
0.75881262, 0.84353741, 0.70810142, 0.73809524, 0.74737168]), 'split1_test_score': array([0.84972171, 0.70562771, 0.82560297, 0.83920841, 0.84910328,
0.71428571, 0.83426098, 0.7563389 , 0.75170068, 0.76777984]), 'split2_test_score': array([0.83854006, 0.68017321, 0.83544695, 0.82245592, 0.83544695,
0.8097742 , 0.85029384, 0.75069595, 0.73182802, 0.71326941]), 'split3_test_score': array([0.84658212, 0.67553356, 0.83823075, 0.81626972, 0.85709867,
0.7884318 , 0.84936591, 0.7915249 , 0.69007114, 0.73430251]), 'split4_test_score': array([0.84441695, 0.64924219, 0.82709558, 0.82554903, 0.85740798,
0.73430251, 0.85524281, 0.74822147, 0.7339932 , 0.77729663]), 'mean_test_score': array([0.84536361, 0.6834264 , 0.830334 , 0.82699222, 0.84907545,
0.76112137, 0.84654019, 0.75097653, 0.72913766, 0.74800401]), 'std_test_score': array([0.00381436, 0.02130393, 0.00541803, 0.0078357 , 0.00808864,
0.03473073, 0.00717783, 0.02652874, 0.02071876, 0.02299463]), 'rank_test_score': array([ 3, 10, 4, 5, 1, 6, 2, 7, 9, 8])}
Best estimator:
DecisionTreeClassifier(max_features=7, min_samples_leaf=6)
Best hyperparameters:
{'min_samples_leaf': 6, 'max_features': 7, 'max_depth': None, 'criterion': 'gini'}
Tuned Model for Decision Tree Classifier
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
tuned_classifier = DecisionTreeClassifier(min_samples_leaf = 6, max_features=7, max_depth= None, criterion = 'gini')
tuned_classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = tuned_classifier.predict(X_test)
#10-fold cross validation score
cv = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv =3)
tuned_dc_cv = cv.mean()
print("10 fold cross validation :", dc_cv)
from sklearn.metrics import accuracy_score
tuned_dc_ac = accuracy_score(y_pred,y_test)
print("accuracy :", dc_ac)
from sklearn.metrics import precision_score
tuned_dc_p = precision_score(y_pred,y_test,average='weighted')
print("precision :", dc_p)
from sklearn.metrics import recall_score
tuned_dc_r = recall_score(y_pred,y_test,average='weighted')
print("recall :", dc_r)
from sklearn.metrics import f1_score
tuned_dc_f1 = f1_score(y_test, y_pred,average='weighted')
print("F1 Score :", dc_f1)
10 fold cross validation : 0.8485804416403786 accuracy : 0.8083179297597043 precision : 0.8098093342521533 recall : 0.8083179297597043 F1 Score : 0.8077182156541395
def eval_result(rf, X_test, y_test, validation = False):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pred = rf.predict(X_test)
print(classification_report(y_test, pred, target_names = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']))
display(pd.DataFrame(confusion_matrix(y_test, pred),
columns = ['Predicted Forward', 'Predicted Midfielder', 'Predicted Defender', 'Predicted Goalkeeper'],
index = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']))
if hasattr(rf, 'feature_importances_') and not validation:
features = pd.DataFrame({
'Variable' :X_test.columns,
'Importance':rf.feature_importances_
})
features.sort_values('Importance', ascending=False, inplace=True)
display(features.head(20))
eval_result(rf, X_test, y_test, validation = True)
precision recall f1-score support
Forward 0.92 0.92 0.92 1713
Midfielder 0.85 0.79 0.82 1153
Defender 1.00 1.00 1.00 505
Goalkeeper 0.82 0.86 0.84 2039
accuracy 0.87 5410
macro avg 0.90 0.89 0.89 5410
weighted avg 0.87 0.87 0.87 5410
| Predicted Forward | Predicted Midfielder | Predicted Defender | Predicted Goalkeeper | |
|---|---|---|---|---|
| Forward | 1568 | 0 | 0 | 145 |
| Midfielder | 4 | 909 | 0 | 240 |
| Defender | 0 | 0 | 505 | 0 |
| Goalkeeper | 134 | 159 | 0 | 1746 |
# displaying confusion matrix and heat map to find correlation
cm = confusion_matrix(y_test,rf.predict(X_test))
sns.heatmap(cm,annot=True,fmt="d")
<AxesSubplot:>
def eval_result(tuned_rf, X_test, y_test, validation = False):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pred = tuned_rf.predict(X_test)
print(classification_report(y_test, pred, target_names = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']))
display(pd.DataFrame(confusion_matrix(y_test, pred),
columns = ['Predicted Forward', 'Predicted Midfielder', 'Predicted Defender', 'Predicted Goalkeeper'],
index = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']))
if hasattr(tuned_rf, 'feature_importances_') and not validation:
features = pd.DataFrame({
'Variable' :X_test.columns,
'Importance':tuned_rf.feature_importances_
})
features.sort_values('Importance', ascending=False, inplace=True)
display(features.head(20))
eval_result(tuned_rf, X_test, y_test, validation = True)
precision recall f1-score support
Forward 0.92 0.92 0.92 1713
Midfielder 0.85 0.79 0.82 1153
Defender 1.00 1.00 1.00 505
Goalkeeper 0.82 0.85 0.84 2039
accuracy 0.87 5410
macro avg 0.90 0.89 0.89 5410
weighted avg 0.87 0.87 0.87 5410
| Predicted Forward | Predicted Midfielder | Predicted Defender | Predicted Goalkeeper | |
|---|---|---|---|---|
| Forward | 1569 | 0 | 0 | 144 |
| Midfielder | 5 | 911 | 0 | 237 |
| Defender | 0 | 0 | 505 | 0 |
| Goalkeeper | 134 | 162 | 0 | 1743 |
# displaying confusion matrix and heat map to find correlation
cm = confusion_matrix(y_test,tuned_rf.predict(X_test))
sns.heatmap(cm,annot=True,fmt="d")
<AxesSubplot:>
Evaluation Summary Table for 4 Major Positions Classifier Base Model
accuracy = [SVC_ac,log_ac,rf_ac,dc_ac]
ten_fold_cross_validation = [SVC_cv,log_cv,rf_cv,dc_cv]
f1_score = [SVC_f1,log_f1,rf_f1,dc_f1]
precision = [SVC_p,log_p,rf_p,dc_p]
recall = [SVC_r,log_r,rf_r,dc_r]
models = DataFrame({'Accuracy': accuracy, '10 fold cv': ten_fold_cross_validation,
"f1 score": f1_score,'precision': precision,
'recall': recall})
models.index = ['Support Vector Classifier','Logistic Regression Classifier','Random Forest Classifier','Decision Tree Classifier']
models
| Accuracy | 10 fold cv | f1 score | precision | recall | |
|---|---|---|---|---|---|
| Support Vector Classifier | 0.860444 | 0.890951 | 0.860279 | 0.861062 | 0.860444 |
| Logistic Regression Classifier | 0.865989 | 0.891507 | 0.866097 | 0.866087 | 0.865989 |
| Random Forest Classifier | 0.873937 | 0.901095 | 0.873841 | 0.874928 | 0.873937 |
| Decision Tree Classifier | 0.808318 | 0.848580 | 0.807718 | 0.809809 | 0.808318 |
Evaluation Summary Graph for 4 Major Positions Classifier Base Model
models1 = DataFrame({'Accuracy' : models.unstack()}).reset_index()
# plot accuracies
plt.figure(figsize=(8, 7))
fig_models = sns.barplot(x='level_0', y='Accuracy', hue='level_1', data=models1);
fig_models.set(xlabel='Evaluation Metrics', ylabel='Evaluation Metrics Value');
fig_models.set_title('The Accuracy of All Ensemble Models Over Five Evaluation Metrics');
Evaluation Summary Table for 4 Major Positions Classifier Tuned Model
accuracy = [tuned_SVC_ac,tuned_log_ac,tuned_rf_ac,tuned_dc_ac]
ten_fold_cross_validation = [tuned_SVC_cv,tuned_log_cv,tuned_rf_cv,tuned_dc_cv]
f1_score = [tuned_SVC_f1,tuned_log_f1,tuned_rf_f1,tuned_dc_f1]
precision = [tuned_SVC_p,tuned_log_p,tuned_rf_p,tuned_dc_p]
recall = [tuned_SVC_r,tuned_log_r,tuned_rf_r,tuned_dc_r]
models = DataFrame({'Accuracy': accuracy, '10 fold cv': ten_fold_cross_validation,
"f1 score": f1_score,'precision': precision,
'recall': recall})
models.index = ['Support Vector Classifier','Logistic Regression Classifier','Random Forest Classifier','Decision Tree Classifier']
models
| Accuracy | 10 fold cv | f1 score | precision | recall | |
|---|---|---|---|---|---|
| Support Vector Classifier | 0.868022 | 0.890951 | 0.868052 | 0.868572 | 0.868022 |
| Logistic Regression Classifier | 0.867837 | 0.891507 | 0.867936 | 0.867925 | 0.867837 |
| Random Forest Classifier | 0.873937 | 0.900786 | 0.873835 | 0.874838 | 0.873937 |
| Decision Tree Classifier | 0.810351 | 0.838498 | 0.810158 | 0.810781 | 0.810351 |
Evaluation Summary Graph for 4 Major Positions Classifier Tuned Model
models1 = DataFrame({'Accuracy' : models.unstack()}).reset_index()
# plot accuracies
plt.figure(figsize=(8, 7))
fig_models = sns.barplot(x='level_0', y='Accuracy', hue='level_1', data=models1);
fig_models.set(xlabel='Evaluation Metrics', ylabel='Evaluation Metrics Value');
fig_models.set_title('The Accuracy of All Ensemble Models Over Five Evaluation Metrics');